import numpy as np
import pandas as pd
%%time
# Load the May 2016 NYC yellow-taxi trip records, parsing the pickup and
# dropoff timestamps as datetimes (needed for the hourly bucketing below).
raw_data = pd.read_csv('yellow_tripdata_2016-05.csv', parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'])
raw_data.head()
raw_data.shape
%%time
# Drop obviously invalid trips: zero duration, zero passengers, zero distance.
data = raw_data[(raw_data.tpep_pickup_datetime != raw_data.tpep_dropoff_datetime)&(raw_data.passenger_count != 0)
&(raw_data.trip_distance != 0)]
data.shape
def is_inside_NY(p1, p2):
    """Return True if the point (p1, p2) lies inside the NYC bounding box.

    Parameters
    ----------
    p1 : float
        Longitude of the point.
    p2 : float
        Latitude of the point.

    Returns
    -------
    bool
        True when both coordinates are strictly inside the box.
    """
    # Bounding box of New York City — the same limits as the aggregation
    # grid built below (np.linspace over these values).
    min_lon, max_lon = -74.25559, -73.70001
    min_lat, max_lat = 40.49612, 40.91553
    # Chained comparisons with short-circuit `and`, returning the boolean
    # directly instead of `if ...: return True / else: return False`,
    # and avoiding bitwise `&` on scalar bools.
    return (min_lon < p1 < max_lon) and (min_lat < p2 < max_lat)
%%time
# Keep only trips whose pickup point falls inside the NYC bounding box.
data = data[list(map(is_inside_NY, data.pickup_longitude, data.pickup_latitude))]
data.shape
from scipy import stats
from datetime import datetime, timedelta

# Hourly time span covered by the May 2016 data set.
start_time = datetime.strptime('2016-05-01 00', "%Y-%m-%d %H")
end_time = datetime.strptime('2016-05-31 23', "%Y-%m-%d %H")
cur_time = start_time

# 50x50 spatial grid over the NYC bounding box (51 bin edges per axis).
x = np.linspace(-74.25559, -73.70001, 51)
y = np.linspace(40.49612, 40.91553, 51)

# Collect one small DataFrame per hour and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# accumulated frame on every iteration (quadratic).
hourly_frames = []
while cur_time <= end_time:
    cur_time_plus_one = cur_time + timedelta(hours=1)
    data_cur_time = data[(data.tpep_pickup_datetime >= cur_time)
                         & (data.tpep_pickup_datetime < cur_time_plus_one)]
    if len(data_cur_time) > 0:
        # Count pickups per spatial cell for this hour.
        data_temp = stats.binned_statistic_2d(data_cur_time.pickup_longitude,
                                              data_cur_time.pickup_latitude,
                                              None, 'count', bins=[x, y])
        data_temp1 = data_temp.statistic.reshape((2500,))
    else:
        # No rides during this hour: every one of the 2500 cells is zero.
        data_temp1 = [0] * 2500
    hourly_frames.append(pd.DataFrame({'time': cur_time,
                                       'region': range(1, 2501),
                                       'num_of_rides': data_temp1}))
    cur_time += timedelta(hours=1)

data_agregated = pd.concat(hourly_frames, ignore_index=True)
# Enforce the column order expected by later cells.
data_agregated = data_agregated[['time', 'region', 'num_of_rides']]
data_agregated.shape
data_agregated.head(10)
# Locate the grid cell containing the Empire State Building and pull out
# the hourly ride counts for that single region.
empire_state_building = (-73.98583, 40.74778)
esb_hist = stats.binned_statistic_2d([empire_state_building[0]],
                                     [empire_state_building[1]],
                                     None, 'count', bins=[x, y])
esb_counts = esb_hist.statistic.reshape((2500,))
# Exactly one cell received the single point, so argmax finds it;
# +1 because region ids are 1-based.
empire_state_building_bin = esb_counts.argmax() + 1
print('Region of the Empire State Building:', empire_state_building_bin)
data_empire = data_agregated[data_agregated.region == empire_state_building_bin]
import matplotlib.pylab as plt
%pylab inline
plt.figure(figsize(15, 8))
plt.plot(data_empire.time, data_empire.num_of_rides)
plt.ylabel('Number of rides')
plt.xlabel('Time')
plt.title('Number of rides from the Empire State Building location by hours')
plt.show()
print('Number of pairs (hour-region) with no rides: ', data_agregated[data_agregated.num_of_rides == 0].shape[0])
%%time
# Persist the hourly per-region counts for reuse by the later cells.
data_agregated.to_csv('data_agregated_may_2016.csv', index=None)
Загрузим агрегированные данные о поездках в мае 2016. Просуммируем общее количество поездок такси из каждой географической зоны и посчитаем количество ячеек, из которых в мае не было совершено ни одной поездки.
import numpy as np
import pandas as pd
%%time
# Reload the aggregated May 2016 counts produced above.
data = pd.read_csv('data_agregated_may_2016.csv')
data.shape
# Total rides per spatial cell over the whole month.
rides_by_region = data.groupby(['region'])['num_of_rides'].sum()
# Cells from which not a single trip started in May.
print('Number of regions with no rides in May: ',
rides_by_region[lambda x: x == 0].shape[0])
Нарисуем статическую карту Нью-Йорка и отметим точку, где находится Эмпайр-Стейт-Билдинг.
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize = (18, 18))
# Static map limited to the same NYC bounding box as the aggregation grid.
m = Basemap(urcrnrlon=-73.70001, llcrnrlon=-74.25559, urcrnrlat=40.91553, llcrnrlat=40.49612)
m.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels=3200)
# Stored as (lat, lon), hence the swapped indices in the scatter call.
empire_state_building = (40.74778, -73.98583)
plt.scatter(empire_state_building[1], empire_state_building[0], s=250, c='r')
plt.show()
Поверх статической карты Нью-Йорка визуализируем данные о поездках из каждой ячейки.
# Reshape the per-region totals back onto the 50x50 grid; the transpose
# presumably converts the region numbering (x-major) into the (lat, lon)
# layout pcolormesh expects — TODO confirm against the region ordering.
r1 = rides_by_region.values.reshape(50, 50).T
x = np.linspace(-74.25559, -73.70001, 51)
y = np.linspace(40.49612, 40.91553, 51)
lon, lat = np.meshgrid(x, y)
fig = plt.figure(figsize=(18, 15))
m = Basemap(urcrnrlon=-73.70001, llcrnrlon=-74.25559, urcrnrlat=40.91553, llcrnrlat=40.49612)
m.arcgisimage(service='ESRI_StreetMap_World_2D', xpixels=3200)
# Semi-transparent heatmap of monthly ride totals over the street map.
m.pcolormesh(lon, lat, r1, latlon=True, cmap='viridis_r', alpha=0.7)
m.colorbar(label='number of rides');
plt.show()
Интерактивная карта Нью-Йорка
import gmaps
# SECURITY NOTE(review): a real Google Maps API key is hard-coded in the
# source; it should be revoked and read from an environment variable.
gmaps.configure(api_key="AIzaSyC2-yHnSBA7fDgWb4TrW9IhpIfbOqmgUvY")
# Interactive map centered on the Statue of Liberty with a single marker.
statue_of_liberty = [(40.689262, -74.044490)]
fig = gmaps.figure(center=statue_of_liberty[0], zoom_level=13)
fig.add_layer(gmaps.marker_layer(statue_of_liberty))
fig
На интерактивной карте Нью-Йорка: цвет показывает среднее за месяц количество поездок такси в час из этой зоны.
import gmaps.geojson_geometries
import json
import copy

# Average rides per hour over the month (May has 31 * 24 hourly slots).
num_of_hours_in_may = 31*24.
rides_by_region_per_hour = rides_by_region/num_of_hours_in_may

# GeoJSON templates: an empty FeatureCollection and one polygon feature.
# Kept as module-level names because the filtered-map cell reuses them.
json_data_1 = '{"features":[],"type": "FeatureCollection"}'
json_data_2 = '{"type": "Feature", "geometry": {"type": "Polygon","coordinates": [[]]}}'
my_geometry = json.loads(json_data_1)
j0 = json.loads(json_data_2)

# Per-region bounding boxes; assumed to hold exactly 2500 rows, one per
# grid cell in region order — TODO confirm against regions.csv.
regions_data = pd.read_csv('regions.csv', sep=';')
# itertuples walks the rows in order, replacing four label lookups per
# region with attribute access on a namedtuple.
for row in regions_data.itertuples(index=False):
    new_region = copy.deepcopy(j0)
    # GeoJSON rings are closed: the first vertex is repeated at the end.
    new_region['geometry']['coordinates'][0] = [[row.west, row.south],
                                                [row.west, row.north],
                                                [row.east, row.north],
                                                [row.east, row.south],
                                                [row.west, row.south]]
    my_geometry['features'].append(new_region)
len(my_geometry['features'])
from matplotlib.cm import viridis
from matplotlib.colors import to_hex

# Color-scale limits taken from the observed per-hour ride averages.
min_rides = rides_by_region_per_hour.min()
max_rides = rides_by_region_per_hour.max()
rides_range = max_rides - min_rides

def calculate_color(num_of_rides):
    """Map an hourly ride count to a CSS hex color on the viridis scale.

    Busier regions get colors from the dark end of the (reversed) map.
    The original comments referred to a "gini coefficient" — stale text
    copied from the gmaps tutorial; this colors ride counts.
    """
    # Normalize into [0, 1] relative to the observed min/max.
    normalized_rides = (num_of_rides - min_rides) / rides_range
    # Invert so that high traffic maps to the dark end of viridis.
    inverse_rides = 1.0 - normalized_rides
    # matplotlib RGBA tuple -> '#rrggbb' CSS color accepted by gmaps.
    mpl_color = viridis(inverse_rides)
    return to_hex(mpl_color, keep_alpha=False)

# Comprehension instead of the explicit append loop.
colors = [calculate_color(num_of_rides) for num_of_rides in rides_by_region_per_hour]
# Choropleth of the average hourly ride count for every grid cell.
fig = gmaps.figure()
geojson_layer = gmaps.geojson_layer(my_geometry,
fill_color=colors,
stroke_color=colors,
fill_opacity=0.8)
fig.add_layer(geojson_layer)
fig
Отфильтруем ячейки, из которых в мае совершается в среднем меньше 5 поездок в час.
# Keep only the cells averaging at least 5 rides per hour in May.
rides_by_region_per_hour = rides_by_region_per_hour[lambda x: x >= 5]
print('Number of regions with 5 and more rides per hour:', rides_by_region_per_hour.shape[0])
# Rebuild the GeoJSON collection for the surviving regions only; the
# Series index holds the 1-based region ids.
my_geometry_2 = json.loads(json_data_1)
for region in rides_by_region_per_hour.keys():
west = regions_data['west'][region-1]
east = regions_data['east'][region-1]
south = regions_data['south'][region-1]
north = regions_data['north'][region-1]
new_region = copy.deepcopy(j0)
# Closed rectangular ring: the first vertex is repeated at the end.
new_region['geometry']['coordinates'][0] = [[west, south],
[west, north],
[east, north],
[east, south],
[west, south]]
my_geometry_2['features'].append(new_region)
# Uniform blue fill: only membership matters here, not intensity.
fig3 = gmaps.figure()
geojson_layer = gmaps.geojson_layer(my_geometry_2,
fill_color='blue',
stroke_color='blue',
fill_opacity=0.8)
print('Regions with 5 and more rides per hour:')
fig3.add_layer(geojson_layer)
fig3
import numpy as np
import pandas as pd
Выберем район 1231
# Region 1231 — per the earlier cells this is the Empire State Building's
# grid cell.
target_region = 1231
# Monthly aggregate files to stitch together (earlier months disabled).
files_agregated_data = [#'data_agregated_11_2015.csv', 'data_agregated_12_2015.csv', 'data_agregated_01_2016.csv',
                        #'data_agregated_02_2016.csv', 'data_agregated_03_2016.csv',
                        'data_agregated_04_2016.csv',
                        'data_agregated_05_2016.csv']
# DataFrame.append was removed in pandas 2.0; collect the per-file slices
# and concatenate them once instead.
monthly_slices = []
for f in files_agregated_data:
    raw_data = pd.read_csv(f)
    monthly_slices.append(raw_data[raw_data.region == target_region][['time', 'num_of_rides']])
data = pd.concat(monthly_slices)
data.head()
data.shape
import matplotlib.pylab as plt
%pylab inline
# NOTE(review): `figsize(15, 8)` relies on the helper injected by the
# deprecated %pylab magic; plt.figure(figsize=(15, 8)) is the portable form.
plt.figure(figsize(15, 8))
#plt.plot(range(data.shape[0]), data.num_of_rides)
# Only the first 500 hours, to keep the figure readable.
plt.plot(range(500), data.num_of_rides.iloc[:500])
plt.ylabel('Number of rides')
plt.xlabel('Time')
plt.title('Number of rides from the Empire State Building location by hours')
plt.show()
Создаем регрессионные признаки
# Fourier regression features: K sine/cosine harmonics of the weekly
# (168-hour) cycle evaluated at hours 1..T.
K = 5
T = data.shape[0]
# Hour index 1..T, shared by every harmonic (the original built it twice
# per iteration).
t = np.arange(1, T+1)
# Build the (T, K) harmonic matrices column-wise; the original grew an
# array with np.vstack inside the loop, reallocating it every iteration.
s = np.column_stack([np.sin(t*2*np.pi*(i+1)/168) for i in range(K)])
c = np.column_stack([np.cos(t*2*np.pi*(i+1)/168) for i in range(K)])
# Design matrix: K sine columns followed by K cosine columns.
Regr_X = np.hstack((s, c))
Regr_y = data.num_of_rides.values
from sklearn.linear_model import LinearRegression
# Fit the Fourier features to the raw counts; the prediction is the
# weekly-seasonal component to be subtracted out below.
lr = LinearRegression()
lr.fit(Regr_X, Regr_y)
Regr_y_predict = lr.predict(Regr_X)
plt.figure(figsize(15, 8))
#plt.plot(range(data.shape[0]), data.num_of_rides)
#plt.plot(range(data.shape[0]), Regr_y_predict, c='r')
# Overlay the fit (red) on the observed counts, first 500 hours only.
plt.plot(range(500), data.num_of_rides.iloc[:500])
plt.plot(range(500), Regr_y_predict[:500], c='r')
plt.ylabel('Number of rides')
plt.xlabel('Time')
plt.title('Number of rides from the Empire State Building location by hours')
plt.show()
# Residual series after removing the Fourier-regression component; this
# is the series the SARIMA model below is fitted to.
y = data.num_of_rides - Regr_y_predict
plt.figure(figsize(15, 8))
#plt.plot(range(data.shape[0]), y)
plt.plot(range(500), y[:500])
plt.ylabel('Number of rides')
plt.xlabel('Time')
plt.title('')
plt.show()
from scipy import stats
import statsmodels.api as sm
from itertools import product
Проверка стационарности и STL-декомпозиция ряда:
# Index the residual series by timestamp for the statsmodels routines.
y = pd.DataFrame(y)
y.index = pd.DatetimeIndex(data.time)
plt.figure(figsize(15,10))
# STL decomposition and Dickey-Fuller test on the first 500 hours.
sm.tsa.seasonal_decompose(y.num_of_rides[:500]).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(y.num_of_rides[:500])[1])
Сделаем сезонное дифференцирование - по часам
# Seasonal differencing at lag 24 to remove the daily cycle.
y['y_diff'] = y.num_of_rides - y.num_of_rides.shift(24)
plt.figure(figsize(15,10))
# The first 24 values are NaN after the shift, hence the [24: slices.
sm.tsa.seasonal_decompose(y.y_diff[24:524]).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(y.y_diff[24:])[1])
Сделаем еще одно дифференцирование
# One more first-order differencing on top of the seasonal difference.
y['y_diff2'] = y.y_diff - y.y_diff.shift(1)
plt.figure(figsize(15,10))
# 24 + 1 leading NaNs now, hence the [25: slices.
sm.tsa.seasonal_decompose(y.y_diff2[25:525]).plot()
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(y.y_diff2[25:525])[1])
# ACF/PACF of the twice-differenced series, used to pick SARIMA orders.
plt.figure(figsize(15,8))
ax = plt.subplot(211)
sm.graphics.tsa.plot_acf(y.y_diff2[25:].values.squeeze(), lags=180, ax=ax)
pylab.show()
ax = plt.subplot(212)
sm.graphics.tsa.plot_pacf(y.y_diff2[25:].values.squeeze(), lags=168, ax=ax)
pylab.show()
Q = 7, q = 9, P = 6, p = 10
Возьмём следующие параметры: Q=3, q=1, P=1, p=1
# Candidate SARIMA orders: p, q and P each range over {0, 1}; Q is fixed
# at 3 (values suggested by the correlograms above, scaled down to keep
# the grid search tractable).
ps = range(0, 2)
qs = range(0, 2)
Ps = range(0, 2)
Qs = range(3, 4)
# The differencing orders are fixed rather than searched over.
d = 1
D = 1
# Cartesian product of all candidate (p, q, P, Q) combinations.
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)
%%time
results = []
best_aic = float("inf")
warnings.filterwarnings('ignore')
for param in parameters_list:
#try except нужен, потому что на некоторых наборах параметров модель не обучается
try:
model=sm.tsa.statespace.SARIMAX(y.num_of_rides, order=(param[0], d, param[1]),
seasonal_order=(param[2], D, param[3], 24)).fit(disp=-1)
print('Ok parameters:', param)
#выводим параметры, на которых модель не обучается и переходим к следующему набору
except ValueError:
print('wrong parameters:', param)
continue
aic = model.aic
#сохраняем лучшую модель, aic, параметры
if aic < best_aic:
best_model = model
best_aic = aic
best_param = param
results.append([param, model.aic])
warnings.filterwarnings('default')
result_table = pd.DataFrame(results)
result_table.columns = ['parameters', 'aic']
print(result_table.sort_values(by = 'aic', ascending=True).head(10))
print()
print(best_model.summary())
Остатки
# Residual diagnostics for the best SARIMA model: plot, ACF, t-test for
# zero mean, and Dickey-Fuller test for stationarity.
plt.figure(figsize(15,8))
plt.subplot(211)
best_model.resid[25:].plot()
plt.ylabel(u'Residuals')
ax = plt.subplot(212)
sm.graphics.tsa.plot_acf(best_model.resid[25:].values.squeeze(), lags=168, ax=ax)
print("Критерий Стьюдента: p=%f" % stats.ttest_1samp(best_model.resid[25:], 0)[1])
print("Критерий Дики-Фуллера: p=%f" % sm.tsa.stattools.adfuller(best_model.resid[25:])[1])
# In-sample SARIMA fit (red) against the deseasonalized series.
y['model'] = best_model.fittedvalues
#y['model_regr'] = best_model.fittedvalues + Regr_y_predict
plt.figure(figsize(15,7))
y.num_of_rides[25:525].plot()
y.model[25:525].plot(color='r')
plt.ylabel('Num of rides')
pylab.show()
# Final prediction = SARIMA fit plus the Fourier-regression seasonal
# component, compared against the actual ride counts.
pred = y.model.values + Regr_y_predict
plt.figure(figsize(15,7))
#data.num_of_rides.plot()
#data.model[25:].plot(color='r')
plt.plot(data.num_of_rides[25:525].values)
plt.plot(pred[25:525])
plt.ylabel('Num of rides')
pylab.show()